About

Retrieve JSON documents which are accessible via REST API and store them in mongodb.

Prerequesites

  • A running mongodb instance to store the JSON documents (see docker container).
  • Python libraries:
    • pymongo - python bindings for mongodb.
    • jsonpath_rw - path expressions for matching parts of a JSON document.
  • A configuration file (config.yaml) with setting for
    • mongodb instance and collection name
    • REST API url and parameters
    • JSON document format

In [ ]:
from pymongo import MongoClient
from urllib import urlopen
from jsonpath_rw import jsonpath, parse
from datetime import datetime
import json
import yaml

Prepare configuration settings


In [ ]:
with open('config.yaml') as yaml_file:
    cfg = yaml.load(yaml_file)

rest_api = cfg['rest-api']
json_cfg = cfg['json-path']
mongo_db = cfg['mongo']

api_list_items  = rest_api['url'] + rest_api['get_list']
api_get_item    = rest_api['url'] + rest_api['get_detail']

item_id_field  = cfg['json-path']['item_id']
item_list_path = parse(cfg['json-path']['item_list'])

Prepare database connection


In [ ]:
client = MongoClient(mongo_db['url'])

db = client[mongo_db['database']][mongo_db['collection']]

print "%d entries in database." % db.find().count()

Fetching documents via REST API


In [ ]:
# functions for REST API calls

def get_item_list(offset = 0, limit = 100, url = api_list_items):
    request = urlopen(url % (offset, limit))
    return json.loads(request.read())

def get_item(id, url = api_get_item):
    request = urlopen(url % id)
    return json.loads(request.read())

In [ ]:
max_items = 10**6
limit=100

for offset in xrange(0, max_items, limit):
    print ("%s - fetching items %s - %s" % (datetime.now().strftime("%Y-%m-%d %H:%M:%S"), offset, offset+limit))
    
    # download item list
    result    = get_item_list(offset=offset, limit=limit)
    item_list = [ item.value for item in item_list_path.find(result) ]
    
    # stop if result list is empty
    if len(item_list) == 0:
        print "no more results returned"
        break
    
    # extract IDs and compare with items already in database
    item_ids  = [ item[item_id_field] for item in item_list ]
    known_ids = [ item[item_id_field] for item in db.find( {item_id_field: { "$in": item_ids }} ) ]
    new_ids   = [ x for x in item_ids if x not in known_ids ]
    
    print "-> got %d ids (%d known, %d new)" % (len(item_ids), len(known_ids), len(new_ids))
    
    # fetch new items from REST API
    items = []
    for id in new_ids:
        item = get_item(id)
        items.append(item)
    
    # insert new items in database
    if len(items) != 0:
        result = db.insert_many(items)